import os
import gc
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from pandas_profiling import ProfileReport
%matplotlib inline
# Configuration
warnings.simplefilter('ignore')
pd.options.display.max_rows = 999
sns.set_style("darkgrid")
colors = sns.color_palette('Set2')
import matplotlib.font_manager
plt.rcParams.update({'font.family':'sans-serif','font.size':14})
df = pd.read_csv('Sample_EDA_Dataset.csv',encoding = 'unicode_escape', engine ='python')
df = pd.DataFrame(df)
df.duplicated(keep=False).sum()
0
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 51290 entries, 0 to 51289 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 51290 non-null int64 1 Order ID 51290 non-null object 2 Order Date 51290 non-null object 3 Ship Date 51290 non-null object 4 Ship Mode 51290 non-null object 5 Customer ID 51290 non-null object 6 Customer Name 51290 non-null object 7 Segment 51290 non-null object 8 City 51290 non-null object 9 State 51290 non-null object 10 Country 51290 non-null object 11 Postal Code 9994 non-null float64 12 Market 51290 non-null object 13 Region 51290 non-null object 14 Product ID 51290 non-null object 15 Category 51290 non-null object 16 Sub-Category 51290 non-null object 17 Product Name 51290 non-null object 18 Sales 51290 non-null float64 19 Quantity 51290 non-null int64 20 Discount 51290 non-null float64 21 Profit 51290 non-null float64 22 Shipping Cost 51290 non-null float64 23 Order Priority 51290 non-null object dtypes: float64(5), int64(2), object(17) memory usage: 9.4+ MB
df['Postal Code'] = df['Postal Code'].astype('object')
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Row ID | 51290.0 | 25645.500000 | 14806.291990 | 1.000 | 12823.250000 | 25645.500 | 38467.7500 | 51290.000 |
| Sales | 51290.0 | 246.490581 | 487.565361 | 0.444 | 30.758625 | 85.053 | 251.0532 | 22638.480 |
| Quantity | 51290.0 | 3.476545 | 2.278766 | 1.000 | 2.000000 | 3.000 | 5.0000 | 14.000 |
| Discount | 51290.0 | 0.142908 | 0.212280 | 0.000 | 0.000000 | 0.000 | 0.2000 | 0.850 |
| Profit | 51290.0 | 28.610982 | 174.340972 | -6599.978 | 0.000000 | 9.240 | 36.8100 | 8399.976 |
| Shipping Cost | 51290.0 | 26.375915 | 57.296804 | 0.000 | 2.610000 | 7.790 | 24.4500 | 933.570 |
df.describe(include=object).transpose()
| count | unique | top | freq | |
|---|---|---|---|---|
| Order ID | 51290 | 25035 | CA-2014-100111 | 14 |
| Order Date | 51290 | 1430 | 18-06-2014 | 135 |
| Ship Date | 51290 | 1464 | 22-11-2014 | 130 |
| Ship Mode | 51290 | 4 | Standard Class | 30775 |
| Customer ID | 51290 | 1590 | PO-18850 | 97 |
| Customer Name | 51290 | 795 | Muhammed Yedwab | 108 |
| Segment | 51290 | 3 | Consumer | 26518 |
| City | 51290 | 3636 | New York City | 915 |
| State | 51290 | 1094 | California | 2001 |
| Country | 51290 | 147 | United States | 9994 |
| Postal Code | 9994.0 | 631.0 | 10035.0 | 263.0 |
| Market | 51290 | 7 | APAC | 11002 |
| Region | 51290 | 13 | Central | 11117 |
| Product ID | 51290 | 10292 | OFF-AR-10003651 | 35 |
| Category | 51290 | 3 | Office Supplies | 31273 |
| Sub-Category | 51290 | 17 | Binders | 6152 |
| Product Name | 51290 | 3788 | Staples | 227 |
| Order Priority | 51290 | 4 | Medium | 29433 |
df.isnull().sum().sort_values(ascending=False)
Postal Code 41296 Row ID 0 Region 0 Shipping Cost 0 Profit 0 Discount 0 Quantity 0 Sales 0 Product Name 0 Sub-Category 0 Category 0 Product ID 0 Market 0 Order ID 0 Country 0 State 0 City 0 Segment 0 Customer Name 0 Customer ID 0 Ship Mode 0 Ship Date 0 Order Date 0 Order Priority 0 dtype: int64
corre = df.loc[:, df.columns!='Row ID'].corr()
corre
| Sales | Quantity | Discount | Profit | Shipping Cost | |
|---|---|---|---|---|---|
| Sales | 1.000000 | 0.313577 | -0.086722 | 0.484918 | 0.768073 |
| Quantity | 0.313577 | 1.000000 | -0.019875 | 0.104365 | 0.272649 |
| Discount | -0.086722 | -0.019875 | 1.000000 | -0.316490 | -0.079056 |
| Profit | 0.484918 | 0.104365 | -0.316490 | 1.000000 | 0.354441 |
| Shipping Cost | 0.768073 | 0.272649 | -0.079056 | 0.354441 | 1.000000 |
df['Category'].unique()
array(['Technology', 'Furniture', 'Office Supplies'], dtype=object)
quant_cat = df.groupby(['Category']).Quantity.sum().sort_values(ascending=False)
sns.barplot(x=quant_cat,y=quant_cat.index)
plt.show()
quant_cat_sub = df.groupby(['Category','Sub-Category']).Quantity.sum().reset_index().sort_values(by=['Category','Quantity'], ascending=[True,False], key=lambda x : x.replace({'Office Supplies':1,'Furniture':2,'Technology':3}))
plt.figure(figsize=(12,8))
sns.barplot(data=quant_cat_sub,y='Sub-Category',x='Quantity',hue='Category')
plt.title('Sub-Category wise Quantity sold',fontdict={'fontsize':22,'family':'sans serif','color':'darkred'})
plt.xlabel('Total Quantity',fontdict={'fontsize':18,'family':'sans serif','color':'darkred'})
plt.ylabel('Sub-Category',fontdict={'fontsize':18,'family':'sans serif','color':'darkred'})
plt.show()
df_discount = df[df.Discount != 0]
disc_cat = df_discount.groupby('Category')['Discount'].mean().sort_values(ascending=False)
sns.barplot(x=disc_cat,y=disc_cat.index)
plt.show()
plt.figure(figsize=(16,8))
ax1, ax2 = plt.subplot(1,2,1), plt.subplot(1,2,2)
sns.violinplot(data=df,x='Discount',y='Category',inner='box',ax=ax1)
ax1.set_title('For all the items')
sns.violinplot(data=df_discount,x='Discount',y='Category',inner='box',ax=ax2,sharex=ax1)
ax2.set_title('Only for the discounted items')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.show()
print(df.iloc[df_discount.Discount.idxmax()][['Segment','Category','Sub-Category','Quantity','Discount','Profit']])
Segment Home Office Category Furniture Sub-Category Tables Quantity 4 Discount 0.85 Profit -1924.542 Name: 5321, dtype: object
def sales_profit(col,filter_col=None,filter_condition=None,n=5):
if (filter_col == None) | (filter_condition == None):
new_df = df
a = ''
else:
new_df = df[df[filter_col]==filter_condition]
a = 'In {} as {},'.format(filter_col,filter_condition)
print('Sales Contribution by {}\n'.format(col))
print(new_df[col].value_counts().head(10))
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(nrows = 2, ncols = 4, width_ratios=(2, 2, 2, 2), height_ratios=(2, 2),
left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.8, hspace=0.2)
font_title ={'fontsize':20,'family': 'sans-serif','color': 'darkred','weight': 'normal','size': 20}
ax_pie = fig.add_subplot(gs[0, 1:3])
sales = (new_df.groupby(col).Sales.sum().sort_values(ascending=False)*100/new_df.Sales.sum())
if len(sales) > 10:
sns.barplot(x=sales.head(15),y=sales.head(15).index,ax=ax_pie)
ax_pie.set_title('{} \nTop 15 Revenue contributing {}'.format(a,col),)#fontdict=font_title)
elif len(sales) > 5:
ax_pie.pie(sales,autopct='%.2f%%',textprops={'fontsize':16,'color':'white'})
ax_pie.set_title('{} \nRevenue contribution by {}'.format(a,col),)#fontdict=font_title)
ax_pie.legend(bbox_to_anchor=(1,1))
else:
ax_pie.pie(sales,autopct='%.2f%%',labels=sales.index,textprops={'fontsize':16,'color':'white'})
ax_pie.set_title('{} \nRevenue contribution by {}'.format(a,col),)#fontdict=font_title)
ax_pie.legend(bbox_to_anchor=(1,1))
ax_sales = fig.add_subplot(gs[1,:2])
avg_sales = (new_df.groupby(col).Sales.mean().sort_values(ascending=False))#.head(5)
sns.barplot(x=avg_sales.values,y=avg_sales.index)
ax_sales.set_title('Average Revenue per purchase by {}'.format(col),)#fontdict=font_title)
ax_sales.set_xlabel('Revenue')
ax_profit = fig.add_subplot(gs[1,2:])
avg_profit = (new_df.groupby(col).Profit.mean().sort_values(ascending=False))#.head(5)
sns.barplot(x=avg_profit.values,y=avg_profit.index)
ax_profit.set_title('Average Profit per purchase by {}'.format(col),)#fontdict=font_title)
ax_profit.set_xlabel('Profit')
ax_profit.set_ylabel('')
plt.show()
sales_profit(col='Region')
Sales Contribution by Region Central 11117 South 6645 EMEA 5029 North 4785 Africa 4587 Oceania 3487 West 3203 Southeast Asia 3129 East 2848 North Asia 2338 Name: Region, dtype: int64
sales_profit(col='Category',filter_col='Region',filter_condition='Southeast Asia')
Sales Contribution by Category Office Supplies 1773 Furniture 687 Technology 669 Name: Category, dtype: int64
tot_sales_state = (df.groupby('State').Sales.sum().sort_values(ascending=False))
plt.figure(figsize=(12,4))
sns.barplot(x=tot_sales_state.head(),y=tot_sales_state.head().index)
plt.title('Top 5 Revenue contributing state')
plt.show()
tot_sales_city = (df.groupby('City').Sales.sum().sort_values(ascending=False))
plt.figure(figsize=(12,4))
sns.barplot(x=tot_sales_city.head(10),y=tot_sales_city.head(10).index)
plt.title('Top 10 Revenue contributing cities')
#plt.savefig('top_rev_city.png')
plt.show()
sales_city = (df.groupby('City').Sales.mean().sort_values(ascending=False))
profit_city = (df.groupby('City').Profit.mean().sort_values(ascending=False))
fig = plt.figure(figsize=(16, 4))
gs = fig.add_gridspec(nrows = 1, ncols = 2, width_ratios=(2, 2), left=0.05, right=0.95, wspace=0.28)
ax_rev = fig.add_subplot(gs[0, 0])
sns.barplot(x=sales_city.head(10),y=sales_city.head(10).index,ax=ax_rev)
ax_rev.set_title('Avg Revene per purchase by City')
ax_pro = fig.add_subplot(gs[0, 1])
sns.barplot(x=profit_city.head(10),y=profit_city.head(10).index,ax=ax_pro)
ax_pro.set_title('Avg Profit per purchase by City')
ax_pro.set_ylabel('')
plt.show()
sales_profit('Ship Mode')
Sales Contribution by Ship Mode Standard Class 30775 Second Class 10309 First Class 7505 Same Day 2701 Name: Ship Mode, dtype: int64
sales_profit('Segment')
Sales Contribution by Segment Consumer 26518 Corporate 15429 Home Office 9343 Name: Segment, dtype: int64
sales_profit('Category')
Sales Contribution by Category Office Supplies 31273 Technology 10141 Furniture 9876 Name: Category, dtype: int64
sales_profit(col='Sub-Category',filter_col='Category',filter_condition='Technology')
Sales Contribution by Sub-Category Phones 3357 Accessories 3075 Copiers 2223 Machines 1486 Name: Sub-Category, dtype: int64
import cufflinks
aggregated_df= df.groupby(pd.Grouper(key="Category")).aggregate(aggregated_value=("Quantity", "sum")).reset_index()
aggregated_df
| Category | aggregated_value | |
|---|---|---|
| 0 | Technology | 35176 |
| 1 | Furniture | 34954 |
| 2 | Office Supplies | 108182 |
fig = aggregated_df.iplot(kind="bar", asFigure=True,
x="Category", y="aggregated_value")
fig
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("your_report.html")